"""build_basic_algos_dataset.py — generate 100 000 basic‑algorithm examples

Tasks (20 000 each): add, sub, max, min, sort
Outputs:
  data/train.jsonl (90 000 rows)
  data/valid.jsonl (10 000 rows)
Each row:
  {
    "skill": "add",
    "prompt": "Add 5 and 7.",
    "code": "5 + 7",
    "tagged_prompt": "<INT>...",
    "tagged_code": "<INT>..."
  }
Requires python_type_tokenizer.py in the same directory or PYTHONPATH.
"""

import json, random, pathlib, ast, sys
from python_type_tokenizer import PyTypeTokenizer

tok = PyTypeTokenizer()

DATA_DIR = pathlib.Path("data")
DATA_DIR.mkdir(exist_ok=True)

def ri(a=-99,b=99): return random.randint(a,b)
def rlist(): return random.sample(range(-50,51), k=random.randint(4,8))

def g_add():
    a,b = ri(),ri()
    return "add", f"Add {a} and {b}.", f"{a} + {b}"
def g_sub():
    a,b = ri(),ri()
    return "sub", f"Subtract {b} from {a}.", f"{a} - {b}"
def g_max():
    lst=rlist(); return "max", f"Find the maximum of {lst}.", f"max({lst})"
def g_min():
    lst=rlist(); return "min", f"Find the minimum of {lst}.", f"min({lst})"
def g_sort():
    lst=rlist(); return "sort", f"Sort the list {lst}.", f"sorted({lst})"

TASKS=[g_add,g_sub,g_max,g_min,g_sort]

records=[]
while len(records)<100_000:
    skill,prompt,code = random.choice(TASKS)()
    try:
        _=eval(code)
        records.append({
            "skill":skill,
            "prompt":prompt,
            "code":code,
            "tagged_prompt":tok.tag_text(prompt),
            "tagged_code":tok.tag_text(code)
        })
    except Exception:
        pass

random.shuffle(records)
split=int(0.9*len(records))
train,valid=records[:split],records[split:]

def dump(path,data):
    with open(path,'w') as f:
        for r in data:
            json.dump(r,f); f.write("\n")

dump(DATA_DIR/"train.jsonl",train)
dump(DATA_DIR/"valid.jsonl",valid)

print(f"Wrote {len(train)} train and {len(valid)} valid examples to {DATA_DIR}")